/*
 * OpenCL C source of the `idct_float` kernel, stored as one C string
 * (presumably compiled at run time by host code outside this file).
 *
 * Kernel arguments:
 *   input               64 shorts per block, read at input + 64*blkn.
 *   output              64 bytes per block, written at output + 64*blkn.
 *   dequantilize_table  tables of 64 floats each; the one used for a block is
 *                       number order[blkn % blocks_per_mcu].
 *   order               per-position table index inside one MCU.
 *   blocks_per_mcu      modulus applied to the block number before `order`.
 *   totalblocks         work-items with get_global_id(0) >= totalblocks do
 *                       nothing, so the global size may be rounded up.
 *
 * Per block the kernel multiplies the coefficients by the table, runs the
 * same 1-D butterfly twice with an 8x8 transpose after each pass, then
 * divides by 8, adds 128, clamps to [0, 255] and stores the bytes.
 * The butterfly constants and "phase" comments look like the AAN float IDCT
 * of IJG libjpeg (jidctflt.c) -- TODO confirm the origin.
 *
 * Vector constants are spelled as OpenCL vector literals, `(float8)(x)`, with
 * `f`-suffixed scalars.  The constructor-call spelling `float8(x)` is not
 * OpenCL C and is only accepted by some vendor compilers; the suffix keeps
 * the constants single precision on devices without fp64.
 *
 * NOTE(review): `size_t` is not a permitted kernel argument type in OpenCL C.
 * It is left as is because the host-side clSetKernelArg call must change in
 * step with it.
 * NOTE(review): convert_uchar8 without a rounding suffix truncates toward
 * zero; confirm that round-to-nearest is not wanted in the output stage.
 */
const char* idct_kernel_clclh = 
"#define DCTSIZE 8\n"
"#define DCTSIZE2 64\n"
/* LOADSRC(i, src): row i of the block as floats, times row i of `table`. */
"#define LOADSRC(i, src) convert_float8(vload8(i, src))*vload8(i, table)\n"
"__kernel void idct_float(__global short* input, __global unsigned char* output, __global const float* dequantilize_table, __global const int* order, int blocks_per_mcu, size_t totalblocks)\n"
"{\n"
"    int blkn = get_global_id(0);\n"
"    if (blkn < totalblocks)\n"
"    {\n"
"        __global short* src = input + DCTSIZE2*blkn;\n"
"        __global unsigned char* outptr = output + DCTSIZE2*blkn;\n"
"        __global const float* table = dequantilize_table + order[blkn % blocks_per_mcu]*DCTSIZE2;\n"
"        float8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;\n"
"        float8 w0, w1, w2, w3, w4, w5, w6, w7;\n"
"        float8 tmp10, tmp11, tmp12, tmp13;\n"
"        float8 z5, z10, z11, z12, z13;\n"
/* ---- Pass 1: even rows 0,2,4,6; each lane of a float8 is one column. ---- */
"        tmp0 = LOADSRC(0, src);\n"
"        tmp1 = LOADSRC(2, src);\n"
"        tmp2 = LOADSRC(4, src);\n"
"        tmp3 = LOADSRC(6, src);\n"
"        tmp10 = tmp0 + tmp2;	/* phase 3 */\n"
"        tmp11 = tmp0 - tmp2;\n"
"        \n"
"        tmp13 = tmp1 + tmp3;	/* phases 5-3 */\n"
"        tmp12 = (tmp1 - tmp3) * (float8)(1.414213562f) - tmp13; /* 2*c4 */\n"
"        \n"
"        tmp0 = tmp10 + tmp13;	/* phase 2 */\n"
"        tmp3 = tmp10 - tmp13;\n"
"        tmp1 = tmp11 + tmp12;\n"
"        tmp2 = tmp11 - tmp12;\n"
"        \n"
/* ---- Pass 1: odd rows 1,3,5,7. ---- */
"        tmp4 = LOADSRC(1, src);\n"
"        tmp5 = LOADSRC(3, src);\n"
"        tmp6 = LOADSRC(5, src);\n"
"        tmp7 = LOADSRC(7, src);\n"
"        \n"
"        z13 = tmp6 + tmp5;		/* phase 6 */\n"
"        z10 = tmp6 - tmp5;\n"
"        z11 = tmp4 + tmp7;\n"
"        z12 = tmp4 - tmp7;\n"
"        \n"
"        tmp7 = z11 + z13;		/* phase 5 */\n"
"        tmp11 = (z11 - z13) * (float8)(1.414213562f); /* 2*c4 */\n"
"        \n"
"        z5 = (z10 + z12) * (float8)(1.847759065f); /* 2*c2 */\n"
"        tmp10 = (float8)(1.082392200f) * z12 - z5; /* 2*(c2-c6) */\n"
"        tmp12 = (float8)(-2.613125930f) * z10 + z5; /* -2*(c2+c6) */\n"
"        \n"
"        tmp6 = tmp12 - tmp7;	/* phase 2 */\n"
"        tmp5 = tmp11 - tmp6;\n"
"        tmp4 = tmp10 + tmp5;\n"
"        \n"
/*
 * Final butterflies, done in place: after `a = a + b`, the expression
 * `a - 2*b` equals the old `a - b`.  On exit tmpK holds output row K
 * (for K = 3 and 4 the roles are swapped: tmp4 = old tmp3 + tmp4,
 * tmp3 = old tmp3 - tmp4).
 */
"        tmp0 = tmp0 + tmp7;\n"
"        tmp7 = tmp0 - (float8)(2.0f)*tmp7;\n"
"        tmp1 = tmp1 + tmp6;\n"
"        tmp6 = tmp1 - (float8)(2.0f)*tmp6;\n"
"        tmp2 = tmp2 + tmp5;\n"
"        tmp5 = tmp2 - (float8)(2.0f)*tmp5;\n"
"        tmp4 = tmp3 + tmp4;\n"
"        tmp3 = (float8)(2.0f)*tmp3 - tmp4;\n"
"        /*Cross*/\n"
/* Transpose: w<i> gathers lane i of tmp0..tmp7. */
"#define TRANS(w, i) w##i = (float8)(tmp0.s##i, tmp1.s##i, tmp2.s##i, tmp3.s##i, tmp4.s##i, tmp5.s##i, tmp6.s##i, tmp7.s##i)\n"
"        TRANS(w, 0);\n"
"        TRANS(w, 1);\n"
"        TRANS(w, 2);\n"
"        TRANS(w, 3);\n"
"        TRANS(w, 4);\n"
"        TRANS(w, 5);\n"
"        TRANS(w, 6);\n"
"        TRANS(w, 7);\n"
"#undef TRANS\n"
"        \n"
/* ---- Pass 2: the same butterfly on the transposed data w0..w7. ---- */
"        tmp10 = w0 + w4;\n"
"        tmp11 = w0 - w4;\n"
"        \n"
"        tmp13 = w2 + w6;\n"
"        tmp12 = (w2 - w6) * (float8)(1.414213562f) - tmp13;\n"
"        \n"
"        tmp0 = tmp10 + tmp13;\n"
"        tmp3 = tmp10 - tmp13;\n"
"        tmp1 = tmp11 + tmp12;\n"
"        tmp2 = tmp11 - tmp12;\n"
"        \n"
"        z13 = w5 + w3;\n"
"        z10 = w5 - w3;\n"
"        z11 = w1 + w7;\n"
"        z12 = w1 - w7;\n"
"        \n"
"        tmp7 = z11 + z13;\n"
"        tmp11 = (z11 - z13) * (float8)(1.414213562f);\n"
"        \n"
"        z5 = (z10 + z12) * (float8)(1.847759065f); /* 2*c2 */\n"
"        tmp10 = (float8)(1.082392200f) * z12 - z5; /* 2*(c2-c6) */\n"
"        tmp12 = (float8)(-2.613125930f) * z10 + z5; /* -2*(c2+c6) */\n"
"        \n"
"        tmp6 = tmp12 - tmp7;\n"
"        tmp5 = tmp11 - tmp6;\n"
"        tmp4 = tmp10 + tmp5;\n"
"        \n"
"        tmp0 = tmp0 + tmp7;\n"
"        tmp7 = tmp0 - (float8)(2.0f)*tmp7;\n"
"        tmp1 = tmp1 + tmp6;\n"
"        tmp6 = tmp1 - (float8)(2.0f)*tmp6;\n"
"        tmp2 = tmp2 + tmp5;\n"
"        tmp5 = tmp2 - (float8)(2.0f)*tmp5;\n"
"        tmp4 = tmp3 + tmp4;\n"
"        tmp3 = (float8)(2.0f)*tmp3 - tmp4;\n"
"        /*Cross*/\n"
/* Second transpose brings the data back to row order: w<i> is output row i. */
"#define TRANS(w, i) w##i = (float8)(tmp0.s##i, tmp1.s##i, tmp2.s##i, tmp3.s##i, tmp4.s##i, tmp5.s##i, tmp6.s##i, tmp7.s##i)\n"
"        TRANS(w, 0);\n"
"        TRANS(w, 1);\n"
"        TRANS(w, 2);\n"
"        TRANS(w, 3);\n"
"        TRANS(w, 4);\n"
"        TRANS(w, 5);\n"
"        TRANS(w, 6);\n"
"        TRANS(w, 7);\n"
"#undef TRANS\n"
"        /* Final output stage: scale down by a factor of 8 and range-limit */\n"
"#define RESULT(t) convert_uchar8(clamp((t)/(float8)(8.0f)+(float8)(128.0f), (float8)(0.0f), (float8)(255.0f)))\n"
"        vstore8(RESULT(w0), 0, outptr);\n"
"        vstore8(RESULT(w7), 7, outptr);\n"
"        vstore8(RESULT(w1), 1, outptr);\n"
"        vstore8(RESULT(w6), 6, outptr);\n"
"        vstore8(RESULT(w2), 2, outptr);\n"
"        vstore8(RESULT(w5), 5, outptr);\n"
"        vstore8(RESULT(w4), 4, outptr);\n"
"        vstore8(RESULT(w3), 3, outptr);\n"
"#undef RESULT\n"
"    }\n"
"}\n"
;
/*
 * OpenCL C source of the `yuv_rgb` kernel, stored as one C string
 * (presumably compiled at run time by host code outside this file).
 *
 * Work-item (x, y_origin) converts one run of 8 horizontally adjacent pixels:
 * x counts 8-pixel spans, y_origin counts pixel rows.  It reads 8 Y, 8 U and
 * 8 V bytes from `yuvbuffer`, applies
 *     R = Y + 1.402*(V-128)
 *     G = Y - 0.34414*(U-128) - 0.71414*(V-128)
 *     B = Y + 1.772*(U-128)
 * clamps to [0, 255] and writes 24 bytes of interleaved R,G,B at
 * rgba + 3*(output_stride*y_origin + 8*x).  Despite the parameter name, three
 * bytes per pixel are written and `output_stride` is counted in pixels.
 *
 * `struct ComponentInfo` is passed by value; the host-side structure is
 * assumed to have the same field order and layout -- verify against caller.
 *
 * Vector constants are spelled as OpenCL vector literals, `(float8)(x)`, with
 * `f`-suffixed scalars.  The constructor-call spelling `float8(x)` is not
 * OpenCL C and is only accepted by some vendor compilers.
 *
 * NOTE(review): there is no bounds check on x / y_origin, so the global size
 * must match the image exactly.
 * NOTE(review): the block index inside an MCU uses a fixed row stride of 2
 * (`2*(y%info.YH)` etc.) rather than the component width -- confirm this is
 * intended for every sampling layout the host supports.
 * NOTE(review): U and V are read at the same row offset and as 8 consecutive
 * samples, exactly like Y; no chroma upsampling is visible here -- confirm
 * how subsampled chroma is handled.
 */
const char* yuv_rgb_clclh = 
"#define DCTSIZE2 64\n"
"#define DCTSIZE 8\n"
"struct ComponentInfo\n"
"{\n"
"    int max_x_sample;\n"
"    int max_y_sample;\n"
"    int YW;\n"
"    int YH;\n"
"    int UW;\n"
"    int UH;\n"
"    int UOffset;\n"
"    int VW;\n"
"    int VH;\n"
"    int VOffset;\n"
"    int blocksInMCU;\n"
"    int MCU_Per_Row;\n"
"};\n"
"__kernel void yuv_rgb(__global unsigned char* yuvbuffer, __global unsigned char* rgba, struct ComponentInfo info, int output_stride)\n"
"{\n"
"    int x = get_global_id(0);\n"
"    int y_origin = get_global_id(1);\n"
/* Split the pixel row into a block row (y) and a row inside the block. */
"    int yoffset = y_origin % DCTSIZE;\n"
"    int y = y_origin/DCTSIZE;\n"
"    int mcux = x/info.max_x_sample;\n"
"    int mcuy = y/info.max_y_sample;\n"
"    __global unsigned char* output = rgba + 3*(output_stride*y_origin + x*DCTSIZE);\n"
/* `basic` is the first block of the MCU that contains this span. */
"    __global unsigned char* basic = yuvbuffer + DCTSIZE2*info.blocksInMCU*(info.MCU_Per_Row*mcuy + mcux);\n"
"    __global unsigned char* Y = basic + DCTSIZE2*((x%info.YW) + 2*(y%info.YH)) + DCTSIZE*yoffset;\n"
"    __global unsigned char* U = basic + DCTSIZE2*((x%info.UW) + 2*(y%info.UH)+info.UOffset) + DCTSIZE*yoffset;\n"
"    __global unsigned char* V = basic + DCTSIZE2*((x%info.VW) + 2*(y%info.VH)+info.VOffset) + DCTSIZE*yoffset;\n"
"    float8 yy = convert_float8(vload8(0, Y));\n"
"    float8 uu = convert_float8(vload8(0, U)) - (float8)(128.0f);\n"
"    float8 vv = convert_float8(vload8(0, V)) - (float8)(128.0f);\n"
"    uchar8 r, g, b;\n"
"    uchar8 first, second, third;\n"
"#define RESULT(x) convert_uchar8(clamp(x, (float8)(0.0f), (float8)(255.0f)))\n"
"    r = RESULT(yy + (float8)(1.40200f)*vv);\n"
"    g = RESULT(yy - (float8)(0.34414f)*uu - (float8)(0.71414f)*vv);\n"
"    b = RESULT(yy + (float8)(1.77200f)*uu);\n"
"#undef RESULT\n"
/* Interleave the three planes into 24 bytes: r0 g0 b0 r1 g1 b1 ... r7 g7 b7. */
"    first = (uchar8)(r.s0, g.s0, b.s0, r.s1, g.s1, b.s1, r.s2, g.s2);\n"
"    second= (uchar8)(b.s2, r.s3, g.s3, b.s3, r.s4, g.s4, b.s4, r.s5);\n"
"    third = (uchar8)(g.s5, b.s5, r.s6, g.s6, b.s6, r.s7, g.s7, b.s7);\n"
"    vstore8(first, 0, output);\n"
"    vstore8(second, 1, output);\n"
"    vstore8(third, 2, output);\n"
"}\n"
;
